{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "75347845",
   "metadata": {},
   "source": [
    "### Test SV carrier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f53fa643",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "import pysam \n",
    "from pybedtools import BedTool\n",
    "from pathlib import Path\n",
    "from io import StringIO\n",
    "from functools import reduce"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "073b1601",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "ge_matrix_flat_path = wkdir_path.joinpath(\"2_misexp_qc/misexp_gene_cov_corr/tpm_zscore_4568smpls_8610genes_flat_misexp_corr_qc.csv\")\n",
    "chrom = \"chr21\"\n",
    "sv_info_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt\"\n",
    "vcf_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/filtered_merged_gs_svp_10728.vcf.gz\"\n",
    "gencode_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/reference/gencode/gencode.v31.annotation.sorted.gtf.gz\"\n",
    "wgs_rna_paired_smpls_path = wkdir_path.joinpath(\"1_rna_seq_qc/wgs_rna_match/paired_wgs_rna_postqc_prioritise_wgs.tsv\")\n",
    "vep_msc_path = wkdir_path.joinpath(\"4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_parsed.tsv\")\n",
    "window = 200000\n",
    "root_dir = wkdir_path.joinpath(\"4_vrnt_enrich/sv_count_carriers/test/gene_body/200kb_window\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "69be08a3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Inputs:\n",
      "- Chromosome: chr21\n",
      "- Window size: 200000bp\n",
      "\n",
      "Gene IDs passing filters: 8610\n",
      "RNA-seq sample IDs passing QC: 4568\n",
      "\n",
      "Loading input VCF ...\n",
      "VCF loaded.\n",
      "Subset VCF to samples with RNA-seq ...\n",
      "Number of samples in VCF with RNA ID and passing QC: 2640\n",
      "Number of RNA IDs passing QC: 2640\n",
      "Number of genes pass QC on chr21: 148\n",
      "Sample IDs in gene expression matrix with SV calls: 2640\n",
      "Number of variants intersecting gene windows: 1131\n"
     ]
    }
   ],
   "source": [
    "print(\"Inputs:\")\n",
    "print(f\"- Chromosome: {chrom}\")\n",
    "print(f\"- Window size: {window}bp\")\n",
    "print(\"\")\n",
    "# create root directory \n",
    "root_dir_path = Path(root_dir)\n",
    "root_dir_path.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "### Collect samples with VCF calls and RNA sequencing \n",
    "# read in gene expression file \n",
    "ge_matrix_flat_df = pd.read_csv(ge_matrix_flat_path, sep=\",\")\n",
    "gene_id_pass_qc_set = set(ge_matrix_flat_df.gene_id.unique())\n",
    "print(f\"Gene IDs passing filters: {len(gene_id_pass_qc_set)}\")\n",
    "smpl_id_pass_qc_set = set(ge_matrix_flat_df.rna_id.unique())\n",
    "print(f\"RNA-seq sample IDs passing QC: {len(smpl_id_pass_qc_set)}\")\n",
    "print(\"\")\n",
    "# egan ID, RNA ID sample links \n",
    "wgs_rna_paired_smpls_df = pd.read_csv(wgs_rna_paired_smpls_path, sep=\"\\t\")\n",
    "egan_ids_with_rna = wgs_rna_paired_smpls_df[wgs_rna_paired_smpls_df.rna_id.isin(smpl_id_pass_qc_set)].egan_id.tolist()\n",
    "# load VCF and subset to EGAN IDs with RNA \n",
    "print(\"Loading input VCF ...\")\n",
    "vcf_path = vcf_path\n",
    "vcf = pysam.VariantFile(vcf_path, mode = \"r\")\n",
    "print(\"VCF loaded.\")\n",
    "print(\"Subset VCF to samples with RNA-seq ...\")\n",
    "vcf_samples = [sample for sample in vcf.header.samples]\n",
    "vcf_egan_ids_with_rna = set(egan_ids_with_rna).intersection(set(vcf_samples))\n",
    "vcf.subset_samples(vcf_egan_ids_with_rna)\n",
    "vcf_samples_with_rna = [sample for sample in vcf.header.samples]\n",
    "print(f\"Number of samples in VCF with RNA ID and passing QC: {len(vcf_samples_with_rna)}\")\n",
    "# subset egan ID and RNA ID links to samples with SV calls and passing QC \n",
    "wgs_rna_paired_smpls_with_sv_calls_df = wgs_rna_paired_smpls_df[wgs_rna_paired_smpls_df.egan_id.isin(vcf_samples_with_rna)]\n",
    "# write EGAN-RNA ID pairs to file\n",
    "egan_rna_smpls_dir = root_dir_path.joinpath(\"egan_rna_smpls\")\n",
    "Path(egan_rna_smpls_dir).mkdir(parents=True, exist_ok=True)\n",
    "wgs_rna_paired_smpls_with_sv_calls_df.to_csv(egan_rna_smpls_dir.joinpath(\"egan_rna_ids_paired_pass_qc.tsv\"), sep=\"\\t\", index=False)\n",
    "rna_id_pass_qc_sv_calls = wgs_rna_paired_smpls_with_sv_calls_df.rna_id.unique().tolist()\n",
    "print(f\"Number of RNA IDs passing QC: {len(rna_id_pass_qc_sv_calls)}\")\n",
    "\n",
    "### write bed file for genes on chromosome passing QC \n",
    "gene_bed_dir = root_dir_path.joinpath(\"genes_bed\")\n",
    "gene_bed_dir.mkdir(parents=True, exist_ok=True)\n",
    "gene_bed_path = gene_bed_dir.joinpath(f\"{chrom}_genes.bed\")\n",
    "gene_id_pass_qc_on_chrom = []\n",
    "with open(gene_bed_path, \"w\") as f:\n",
    "    for gtf in pysam.TabixFile(gencode_path).fetch(chrom, parser = pysam.asGTF()):\n",
    "        if gtf.feature == \"gene\" and gtf.gene_id.split('.')[0] in gene_id_pass_qc_set:\n",
    "            # check for multiple entries with same name \n",
    "            gene_id_list, chrom_list, start_list, end_list = [], [], [], []\n",
    "            gene_id_list.append(gtf.gene_id.split('.')[0])\n",
    "            chrom_list.append(gtf.contig)\n",
    "            start_list.append(gtf.start)\n",
    "            end_list.append((gtf.end))\n",
    "            # check or write output\n",
    "            if len(gene_id_list) > 1 or len(chrom_list) > 1 or len(start_list) > 1 or len(end_list) > 1:\n",
    "                print(f\"{gene_id} has multiple entries in gencode file - excluded from output file.\")\n",
    "            elif len(chrom_list) == 0 or len(start_list) == 0 or len(end_list) == 0: \n",
    "                print(f\"{gene_id} has no entries in gencode file - excluded from output file.\")\n",
    "            else: \n",
    "                gene_id, gtf_chrom, start, end = gene_id_list[0], chrom_list[0], start_list[0], end_list[0]\n",
    "                gene_id_pass_qc_on_chrom.append(gene_id)\n",
    "                chrom_num = gtf_chrom.split(\"chr\")[1]\n",
    "                f.write(f\"{chrom_num}\\t{start}\\t{end}\\t{gene_id}\\n\")\n",
    "print(f\"Number of genes pass QC on {chrom}: {len(gene_id_pass_qc_on_chrom)}\")\n",
    "\n",
    "# subset gene expression file to genes on chromosome \n",
    "ge_matrix_flat_chrom_df = ge_matrix_flat_df[ge_matrix_flat_df.gene_id.isin(gene_id_pass_qc_on_chrom)]\n",
    "# subset gene expression file to samples with SV calls \n",
    "ge_matrix_flat_chrom_egan_df = pd.merge(ge_matrix_flat_chrom_df, wgs_rna_paired_smpls_with_sv_calls_df, how=\"inner\", on=\"rna_id\")\n",
    "print(f\"Sample IDs in gene expression matrix with SV calls: {len(ge_matrix_flat_chrom_egan_df.egan_id.unique())}\")\n",
    "# write to file \n",
    "ge_matrix_flat_chrom_egan_dir = root_dir_path.joinpath(\"express_mtx\")\n",
    "Path(ge_matrix_flat_chrom_egan_dir).mkdir(parents=True, exist_ok=True)\n",
    "ge_matrix_flat_chrom_egan_df.to_csv(ge_matrix_flat_chrom_egan_dir.joinpath(f\"{chrom}_ge_mtx_flat.tsv\"), sep=\"\\t\", index=False)\n",
    "\n",
    "### write SVs on chromosome to bed file \n",
    "vrnts_bed_dir = root_dir_path.joinpath(\"vrnts_bed\")\n",
    "vrnts_bed_dir.mkdir(parents=True, exist_ok=True)\n",
    "vrnts_bed_path = vrnts_bed_dir.joinpath(f\"{chrom}_vrnts.bed\")\n",
    "with open(sv_info_path, \"r\") as f_in, open(vrnts_bed_path, \"w\") as f_out:\n",
    "    for line in f_in:\n",
    "        if line.startswith(\"plinkID\"): \n",
    "            continue\n",
    "        else: \n",
    "            vrnt_id, sv_chrom, pos, end = line.split(\"\\t\")[0], line.split(\"\\t\")[2], line.split(\"\\t\")[3], line.split(\"\\t\")[4]\n",
    "            if sv_chrom == chrom:\n",
    "                chrom_num = sv_chrom.split(\"chr\")[1]\n",
    "                f_out.write(f\"{chrom_num}\\t{pos}\\t{end}\\t{vrnt_id}\\n\")\n",
    "\n",
    "### intersect variants with gene windows \n",
    "intersect_bed_dir = root_dir_path.joinpath(\"intersect_bed\")\n",
    "intersect_bed_dir.mkdir(parents=True, exist_ok=True)\n",
    "intersect_bed_path = intersect_bed_dir.joinpath(f\"{chrom}_intersect.bed\")\n",
    "\n",
    "vrnts_bed = BedTool(vrnts_bed_path)\n",
    "gencode_bed= BedTool(gene_bed_path)\n",
    "\n",
    "intersect_bed_str = StringIO(str(gencode_bed.window(vrnts_bed, w=window)))\n",
    "columns={0:\"chrom_gene\", 1:\"start_gene\", 2:\"end_gene\", 3: \"gene_id\", \n",
    "         4:\"chrom_vrnt\", 5:\"start_vrnt\", 6:\"end_vrnt\", 7:\"vrnt_id\"}\n",
    "intersect_bed_df = pd.read_csv(intersect_bed_str, \n",
    "                               sep=\"\\t\", \n",
    "                               header=None).rename(columns=columns)\n",
    "intersect_bed_df.to_csv(intersect_bed_path, sep=\"\\t\", index=False, header=None)\n",
    "# generate set of variant IDs inside gene windows\n",
    "intersect_vrnt_ids_no_dupl_df = intersect_bed_df[[\"chrom_vrnt\", \"vrnt_id\", \"start_vrnt\", \"end_vrnt\"]].drop_duplicates()\n",
    "intersect_vrnt_ids_list = intersect_vrnt_ids_no_dupl_df.vrnt_id.unique()\n",
    "num_intersect_vrnt_ids = len(intersect_vrnt_ids_list)\n",
    "print(f\"Number of variants intersecting gene windows: {num_intersect_vrnt_ids}\")\n",
    "\n",
    "### Genotypes for all variant IDs in windows\n",
    "count = 0\n",
    "vrnt_gt_egan_dict = {}\n",
    "for vrnt_id in intersect_vrnt_ids_list:\n",
    "    # get chromosome, start and end of SV \n",
    "    chrom_vrnt, pos, end, = [intersect_vrnt_ids_no_dupl_df[intersect_vrnt_ids_no_dupl_df.vrnt_id == vrnt_id][col].item() for col in [\"chrom_vrnt\", \"start_vrnt\", \"end_vrnt\"]]\n",
    "    # search VCF for variant ID\n",
    "    records = vcf.fetch(str(chrom_vrnt), pos-1, end)\n",
    "    found_vrnt_id = False\n",
    "    for rec in records: \n",
    "        vcf_vrnt_id = str(rec.id)\n",
    "        if vrnt_id == vcf_vrnt_id:\n",
    "            found_vrnt_id = True \n",
    "            # collect genotypes\n",
    "            gts = [s[\"GT\"] for s in rec.samples.values()]\n",
    "            for i, gt in enumerate(gts): \n",
    "                vrnt_gt_egan_dict[count] = [vrnt_id, vcf_samples_with_rna[i], gt]\n",
    "                count += 1 \n",
    "    if not found_vrnt_id: \n",
    "        raise ValueError(f\"Did not find {vrnt_id} in {vcf_path}\")\n",
    "vrnt_gt_egan_nogene_df = pd.DataFrame.from_dict(vrnt_gt_egan_dict, \n",
    "                                                orient=\"index\", \n",
    "                                                columns=[\"vrnt_id\", \"egan_id\", \"genotype\"])\n",
    "if vrnt_gt_egan_nogene_df.shape[0] != num_intersect_vrnt_ids * len(rna_id_pass_qc_sv_calls):\n",
    "    raise ValueError(\"Number of variant genotypes does not much variant number by samples.\")\n",
    "\n",
    "### add intersecting gene, SV info, VEP MSC and expression\n",
    "# load SV info\n",
    "sv_info_df = pd.read_csv(sv_info_path, sep=\"\\t\", dtype={\"plinkID\":str})\n",
    "sv_info_id_af_df = sv_info_df[[\"plinkID\", \"AF\", \"SVTYPE\"]].rename(columns={\"plinkID\":\"vrnt_id\"})\n",
    "# load most severe consequence \n",
    "vep_msc_df = pd.read_csv(vep_msc_path, sep=\"\\t\", dtype={\"vrnt_id\": str})\n",
    "msc_list = vep_msc_df.Consequence.unique().tolist()\n",
    "vep_msc_cnsqn_df = vep_msc_df.rename(columns={\"Uploaded_variation\": \"vrnt_id\"})[[\"vrnt_id\", \"Consequence\"]]\n",
    "# merge \n",
    "dfs_to_merge = [vrnt_gt_egan_nogene_df, \n",
    "                intersect_bed_df[[\"vrnt_id\",\"gene_id\"]].drop_duplicates(),\n",
    "                sv_info_id_af_df, \n",
    "                vep_msc_cnsqn_df\n",
    "               ]\n",
    "df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['vrnt_id'],\n",
    "                                            how='inner'), dfs_to_merge)\n",
    "# add expression \n",
    "sv_intersect_express_info_df = pd.merge(df_merged, \n",
    "                                     ge_matrix_flat_chrom_egan_df,                  \n",
    "                                     how=\"inner\",\n",
    "                                     on=[\"egan_id\", \"gene_id\"])\n",
    "# add window name\n",
    "sv_intersect_express_info_df[\"window\"] = f\"gene_body_{window}\"\n",
    "# only keep carriers \n",
    "sv_intersect_express_info_carrier_df = sv_intersect_express_info_df[sv_intersect_express_info_df.genotype.isin([(0, 1), (1, 1)])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46ae2001",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write genotypes for SVs  \n",
    "express_gt_info_dir = root_dir_path.joinpath(\"express_carrier_info\")\n",
    "express_gt_info_dir.mkdir(parents=True, exist_ok=True)\n",
    "intersect_bed_path = express_gt_info_dir.joinpath(f\"{chrom}_express_carrier_info.tsv\")\n",
    "sv_intersect_express_info_carrier_df.to_csv(intersect_bed_path, sep=\"\\t\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
